# Draw a tree from a gene presence/absence csv file

library(data.table)
library(taxize)
library(taxizedb)
library(ape)
library(ggtree)

#db_download_ncbi()

fname <- "vertebrate_DNase.csv"
csv <- as.data.frame(fread(file=fname,sep="\t", header = T, fill=T))

fam <- names(csv)[4:length(names(csv))]
csv[,1] <- gsub('(\\d+)_.*','\\1', csv[,1]) #account for modified taxids

################################################
# Calculte tree from taxonomic classification  #
################################################
cl <- taxizedb::classification(csv[,1], db='ncbi')
tr_cl <- class2tree(cl)
tr <- tr_cl$phylo #tree
#dat <- tr_cl$classification #associated data
tid <- tr_cl$names #associated taxid

##############################
# presence/absence dataframe #
##############################
g <- csv[match(tid, csv[,1]) , match(fam, colnames(csv))]
g <- as.data.frame(apply(g,2, function (x) lengths(strsplit(x,','))))
g[g>1] <- 1 #comment to have actual gene numbers
g[] <- lapply(g, factor)
row.names(g) <- tr$tip.label

#################################
# Draw tree and assiociated map #
#################################
t <- tr

#adapting plot to tree size
xlim <- max(lengths(nodepath(t))) * 1
wg <- NCOL(g) / 8
w <- xlim + 20
h <- Ntip(t) / 1.8
tip_off <- wg + xlim + 4

# plot tree
p <- ggtree(t, branch.length='none')+
  geom_tiplab(size=1.5, offset=tip_off, align=TRUE, linetype='dashed', linesize=0.1) +
  geom_nodelab(geom='label', size=3.5) +
  xlim_tree(xlim=c(NA, xlim)) +
  theme(plot.margin=unit(c(1,1,1,0.5), "cm"))


p2 <- gheatmap(p, g, offset=-0.8, width=wg, colnames_position = "top", font.size=3.5, color="white", colnames_offset_y=+0.2, legend_title = '')

p2 <- p2 + scale_fill_manual(values=c("gray88","gray40"))

print(p2)

name = paste0(fname,".ggtree.pdf")
ggsave(name, width = w, height = h, units = "cm", limitsize = F)
